library(NbClust)
library(factoextra)
library(ClusterR)
library(ggplot2)
library(scatterplot3d)
library(dplyr)
library(stringr)
library(VIM)
library(fpc)
library(plotly)
library(clValid)
# Load the eight synthetic clustering datasets (Data1.csv .. Data8.csv)
# into variables data1 .. data8, plus the world indicators dataset.
for (i in seq_len(8)) {
  assign(paste0("data", i), read.csv(paste0("Data", i, ".csv")))
}
world_data <- read.csv('World Indicators.csv')
We check K value recommendation from elbow method and silhouette method
# Fix the RNG seed so the kmeans initialisations below are reproducible
set.seed(10)
# Elbow method: within-cluster sum of squares over candidate k
fviz_nbclust(data1[,2:4], kmeans, method = 'wss')
# Average silhouette width over candidate k
fviz_nbclust(data1[,2:4], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 6
#Setting k = 6 from silhouette recommendation
k <- 6
#Checking the Calinski-Harabasz (CH) index for k values around the suggested k
print("K CH Value")
## [1] "K CH Value"
for(i in (k-2):(k+2)){
# nstart = 20 random restarts guards against poor initial centroids
km <- kmeans(data1[, 2:4], i, nstart = 20)
ch <- round(calinhara(data1[, 2:4],km$cluster),digits=2)
print(paste(i,ch))
}
## [1] "4 98.68"
## [1] "5 146.82"
## [1] "6 262.71"
## [1] "7 225.1"
## [1] "8 478.81"
CH value for k = 6 is higher than for k = 7, but k = 7 matches the dendrogram below and yields perfect external validation, so we select k = 7
# Fit the final K-means model with k = 7 (the value that yields perfect
# external validation against the true labels).
k <- 7
km <- kmeans(data1[, 2:4], k, nstart = 20)
data1$kmeans_cluster <- km$cluster
# Display external validation stats against the ground-truth Class labels
# (use TRUE rather than the reassignable shorthand T)
external_validation(true_labels = data1$Class, clusters = data1$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1, purity = 1 for our clustering solution
# Compute the distance object once and reuse it, instead of calling
# dist() twice on the same data.
d1_dist <- dist(data1[, 2:4])
dist_matrix <- as.matrix(d1_dist)
# Single-linkage hierarchical clustering
hc.single <- hclust(d1_dist, method = 'single')
plot(hc.single)
From dendrogram, we can set number of clusters = 7
# Cut the single-linkage dendrogram into 7 clusters (matches the true classes)
data1$h_cluster <- cutree(hc.single, 7)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data1$Class, clusters = data1$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1 and purity = 1 for our hierarchical clustering
# 3D scatter plots of Data1: coloured by the actual class, then by the
# K-means cluster assignment, then by the hierarchical cluster assignment.
axis_titles <- list(xaxis = list(title = 'X1'),
                    yaxis = list(title = 'X2'),
                    zaxis = list(title = 'X3'))
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
We check K value recommendation from elbow method and silhouette method
# Reproducible seed for kmeans initialisations
set.seed(10)
# Elbow (WSS) and average-silhouette plots to suggest k for Data2
fviz_nbclust(data2[,2:4], kmeans, method = 'wss')
fviz_nbclust(data2[,2:4], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 4
#Setting k = 4 from silhouette recommendation
k <- 4
#Checking CH value for k values around the suggested k-value
print("K CH Value")
## [1] "K CH Value"
for(i in (k-2):(k+2)){
km <- kmeans(data2[, 2:4], i, nstart = 20)
ch <- round(calinhara(data2[, 2:4],km$cluster),digits=2)
print(paste(i,ch))
}
## [1] "2 411.4"
## [1] "3 487.11"
## [1] "4 677.98"
## [1] "5 697.03"
## [1] "6 682.36"
CH value for k = 5 is greater than k = 4, external validation (jaccard_index) is also higher for k=5
# Fit the final K-means model with k = 5 (best CH among candidates)
k <- 5
km <- kmeans(data2[, 2:4], k, nstart = 20)
data2$kmeans_cluster <- km$cluster
# Display external validation stats (TRUE instead of shorthand T)
external_validation(true_labels = data2$Class, clusters = data2$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 0.9876
## entropy : 0.3722
## normalized mutual information : 0.7849
## variation of information : 0.8177
## normalized var. of information : 0.354
## ----------------------------------------
## specificity : 0.9933
## sensitivity : 0.5917
## precision : 0.9808
## recall : 0.5917
## F-measure : 0.7381
## ----------------------------------------
## accuracy OR rand-index : 0.8463
## adjusted-rand-index : 0.6385
## jaccard-index : 0.5849
## fowlkes-mallows-index : 0.7618
## mirkin-metric : 25030
## ----------------------------------------
## [1] 0.5849363
Jaccard index = 0.58, purity = 0.98 for our clustering solution
# Compute the distance object once and reuse it (avoids a second dist() call).
d2_dist <- dist(data2[, 2:4])
dist_matrix <- as.matrix(d2_dist)
# NOTE: the variable is named hc.single for consistency with the rest of the
# script, but centroid linkage is used here (it separates Data2 better).
hc.single <- hclust(d2_dist, method = 'centroid')
plot(hc.single)
> Using “method” as “centroid” gives better clustering results than “single”. Observing the dendrogram, we can cut it to create 4 clusters
# Cut the centroid-linkage dendrogram into 4 clusters
data2$h_cluster <- cutree(hc.single, 4)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data2$Class, clusters = data2$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1 and purity = 1 for our hierarchical clustering
# 3D scatter plots of Data2 (columns X, Y, C): coloured by actual class,
# K-means cluster, and hierarchical cluster respectively.
axis_titles <- list(xaxis = list(title = 'X'),
                    yaxis = list(title = 'Y'),
                    zaxis = list(title = 'C'))
fig <- plot_ly(data2, x = ~X, y = ~Y, z = ~C, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data2, x = ~X, y = ~Y, z = ~C, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data2, x = ~X, y = ~Y, z = ~C, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
We check K value recommendation from elbow method and silhouette method
# Reproducible seed for kmeans initialisations
set.seed(10)
# Elbow (WSS) and average-silhouette plots to suggest k for Data3
fviz_nbclust(data3[,2:4], kmeans, method = 'wss')
fviz_nbclust(data3[,2:4], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 4
#Setting k = 4 from silhouette recommendation
k <- 4
#Checking CH value for k values around the suggested k-value
print("K CH Value")
## [1] "K CH Value"
for(i in (k-2):(k+2)){
km <- kmeans(data3[, 2:4], i, nstart = 20)
ch <- round(calinhara(data3[, 2:4],km$cluster),digits=2)
print(paste(i,ch))
}
## [1] "2 137.95"
## [1] "3 204.3"
## [1] "4 418.39"
## [1] "5 344.24"
## [1] "6 303.4"
CH value for k = 4 highest
# Fit the final K-means model with k = 4 (highest CH)
k <- 4
km <- kmeans(data3[, 2:4], k, nstart = 20)
data3$kmeans_cluster <- km$cluster
# Display external validation stats (TRUE instead of shorthand T)
external_validation(true_labels = data3$Class, clusters = data3$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1, purity = 1 for our clustering solution. We have perfect clustering
# Compute the distance object once and reuse it (avoids a second dist() call).
d3_dist <- dist(data3[, 2:4])
dist_matrix <- as.matrix(d3_dist)
# Centroid linkage (name hc.single kept for consistency with the script)
hc.single <- hclust(d3_dist, method = 'centroid')
plot(hc.single)
> Using “method” as “centroid” gives better clustering results than “single”. Observing the dendrogram, we can cut it to create 4 clusters
# Cut the centroid-linkage dendrogram into 4 clusters
data3$h_cluster <- cutree(hc.single, 4)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data3$Class, clusters = data3$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 0.9875
## entropy : 0.0358
## normalized mutual information : 0.964
## variation of information : 0.1441
## normalized var. of information : 0.0696
## ----------------------------------------
## specificity : 0.9917
## sensitivity : 0.976
## precision : 0.9748
## recall : 0.976
## F-measure : 0.9754
## ----------------------------------------
## accuracy OR rand-index : 0.9878
## adjusted-rand-index : 0.9673
## jaccard-index : 0.952
## fowlkes-mallows-index : 0.9754
## mirkin-metric : 1950
## ----------------------------------------
## [1] 0.9519704
Jaccard index = 0.95 and purity = 0.99 for our hierarchical clustering
# 3D scatter plots of Data3: coloured by actual class, K-means cluster,
# and hierarchical cluster respectively.
axis_titles <- list(xaxis = list(title = 'X1'),
                    yaxis = list(title = 'X2'),
                    zaxis = list(title = 'X3'))
fig <- plot_ly(data3, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data3, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data3, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
A few points are misclustered in cluster 3 and 2 for hierarchical clustering
We check K value recommendation from elbow method and silhouette method
# Reproducible seed for kmeans initialisations
set.seed(10)
# Elbow (WSS) and average-silhouette plots to suggest k for Data4
fviz_nbclust(data4[,2:4], kmeans, method = 'wss')
fviz_nbclust(data4[,2:4], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 4
#Setting k = 8 as a starting point; the loop below scans a wide range (k = 2..13)
k <- 8
#Checking CH value for k values around the suggested k-value
print("K CH Value")
## [1] "K CH Value"
for(i in (k-6):(k+5)){
km <- kmeans(data4[, 2:4], i, nstart = 20)
ch <- round(calinhara(data4[, 2:4],km$cluster),digits=2)
print(paste(i,ch))
}
## [1] "2 730.57"
## [1] "3 588.92"
## [1] "4 615.13"
## [1] "5 664.76"
## [1] "6 711.54"
## [1] "7 799.18"
## [1] "8 918"
## [1] "9 967.82"
## [1] "10 1070.31"
## [1] "11 1153.38"
## [1] "12 1296.86"
## [1] "13 1385.65"
CH value keeps increasing with k and never peaks, so it gives no useful optimum here; since the data has 2 actual classes, we evaluate k = 2
# Evaluate K-means at k = 2, the actual number of classes in Data4
k <- 2
km <- kmeans(data4[, 2:4], k, nstart = 20)
data4$kmeans_cluster <- km$cluster
# Display external validation stats (TRUE instead of shorthand T)
external_validation(true_labels = data4$Class, clusters = data4$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 0.653
## entropy : 0.9314
## normalized mutual information : 0.0686
## variation of information : 1.8627
## normalized var. of information : 0.9645
## ----------------------------------------
## specificity : 0.5468
## sensitivity : 0.5459
## precision : 0.5459
## recall : 0.5459
## F-measure : 0.5459
## ----------------------------------------
## accuracy OR rand-index : 0.5464
## adjusted-rand-index : 0.0927
## jaccard-index : 0.3754
## fowlkes-mallows-index : 0.5459
## mirkin-metric : 453182
## ----------------------------------------
## [1] 0.3754314
Jaccard index = 0.38 and purity = 0.65 for k = 2: K-means cannot recover the true 2-class structure of this dataset because the clusters are not globular, as the visualisation below makes clear.
# Compute the distance object once and reuse it (avoids a second dist() call).
d4_dist <- dist(data4[, 2:4])
dist_matrix <- as.matrix(d4_dist)
# Single linkage handles the non-globular shapes in Data4 well
hc.single <- hclust(d4_dist, method = 'single')
plot(hc.single)
> Using “method” as “single” gives better clustering results than “centroid” in this example. Observing the dendrogram, we can cut it to create 2 clusters.
# Cut the single-linkage dendrogram into 2 clusters
data4$h_cluster <- cutree(hc.single, 2)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data4$Class, clusters = data4$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1 and purity = 1 for our hierarchical clustering, which is perfect clustering and significantly better than K-means. Hierarchical clustering outperforms K-means here.
# 3D scatter plots of Data4: coloured by actual class, K-means cluster,
# and hierarchical cluster respectively.
axis_titles <- list(xaxis = list(title = 'X1'),
                    yaxis = list(title = 'X2'),
                    zaxis = list(title = 'X3'))
fig <- plot_ly(data4, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data4, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data4, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
We check K value recommendation from elbow method and silhouette method
# Reproducible seed for kmeans initialisations
set.seed(10)
# Elbow (WSS) and average-silhouette plots to suggest k for Data5
fviz_nbclust(data5[,2:4], kmeans, method = 'wss')
fviz_nbclust(data5[,2:4], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 10
#Setting k = 10 from silhouette recommendation
k <- 10
#Checking CH value for k values around the suggested k-value (k = 2..15)
print("K CH Value")
## [1] "K CH Value"
for(i in (k-8):(k+5)){
km <- kmeans(data5[, 2:4], i, nstart = 20)
ch <- round(calinhara(data5[, 2:4],km$cluster),digits=2)
print(paste(i,ch))
}
## [1] "2 273.1"
## [1] "3 294.1"
## [1] "4 286.54"
## [1] "5 308.17"
## [1] "6 339.2"
## [1] "7 375.92"
## [1] "8 379.02"
## [1] "9 382.53"
## [1] "10 398.09"
## [1] "11 395.28"
## [1] "12 409.99"
## [1] "13 383.8"
## [1] "14 368.75"
## [1] "15 370.91"
Like in Dataset 4, CH value keeps increasing, CH value for k=2 (actual k) is significantly less. Keeping k = 10
# Keep k = 10 per the CH scan above
k <- 10
km <- kmeans(data5[, 2:4], k, nstart = 20)
data5$kmeans_cluster <- km$cluster
# Display external validation stats (TRUE instead of shorthand T)
external_validation(true_labels = data5$Class, clusters = data5$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 1.5496
## normalized mutual information : 0.5634
## variation of information : 1.5496
## normalized var. of information : 0.6078
## ----------------------------------------
## specificity : 1
## sensitivity : 0.5602
## precision : 1
## recall : 0.5602
## F-measure : 0.7181
## ----------------------------------------
## accuracy OR rand-index : 0.7804
## adjusted-rand-index : 0.5605
## jaccard-index : 0.5602
## fowlkes-mallows-index : 0.7485
## mirkin-metric : 140380
## ----------------------------------------
## [1] 0.560213
Jaccard index = 0.55, purity = 1 for our clustering solution, which suggests we have many clusters that have high cohesion, but overall clustering is not that good. For k = 2, we have a lower jaccard index (0.48) and lower purity (0.71). This is due to structure of data and kmeans is not able to find proper clustering in this dataset which will be clear from visualisation
# Compute the distance object once and reuse it (avoids a second dist() call).
d5_dist <- dist(data5[, 2:4])
dist_matrix <- as.matrix(d5_dist)
# Single linkage follows the chained/non-globular structure of Data5
hc.single <- hclust(d5_dist, method = 'single')
plot(hc.single)
> Using “method” as “single” gives better clustering results than “centroid” in this example. Observing the dendrogram, we can cut it to create 2 clusters.
# Cut the single-linkage dendrogram into 2 clusters
data5$h_cluster <- cutree(hc.single, 2)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data5$Class, clusters = data5$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1 and purity = 1 for our hierarchical clustering, which is perfect clustering and significantly better than K-means. Hierarchical clustering outperforms K-means here.
# 3D scatter plots of Data5: coloured by actual class, K-means cluster,
# and hierarchical cluster respectively.
axis_titles <- list(xaxis = list(title = 'X1'),
                    yaxis = list(title = 'X2'),
                    zaxis = list(title = 'X3'))
fig <- plot_ly(data5, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data5, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data5, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
We check K value recommendation from elbow method and silhouette method
# Reproducible seed for kmeans initialisations
set.seed(10)
# Data6 is 2-dimensional (columns 2:3): elbow and silhouette plots for k
fviz_nbclust(data6[,2:3], kmeans, method = 'wss')
fviz_nbclust(data6[,2:3], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 3
#Setting k = 3 from silhouette recommendation
k <- 3
#Checking CH value for k values around the suggested k-value
print("K CH Value")
## [1] "K CH Value"
for(i in (k-1):(k+2)){
km <- kmeans(data6[, 2:3], i, nstart = 20)
ch <- round(calinhara(data6[, 2:3],km$cluster),digits=2)
print(paste(i,ch))
}
## [1] "2 3191.71"
## [1] "3 3981.74"
## [1] "4 3706.8"
## [1] "5 3713.51"
CH value for K = 3 is greater than for K = 2, but external validation (below) is better for k = 2, so we use k = 2
# External validation favours k = 2 (the true class count), so fit with k = 2
k <- 2
km <- kmeans(data6[, 2:3], k, nstart = 20)
data6$kmeans_cluster <- km$cluster
# Display external validation stats (TRUE instead of shorthand T)
external_validation(true_labels = data6$Class, clusters = data6$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 0.9514
## entropy : 0.2691
## normalized mutual information : 0.7296
## variation of information : 0.5403
## normalized var. of information : 0.4257
## ----------------------------------------
## specificity : 0.9062
## sensitivity : 0.9089
## precision : 0.9064
## recall : 0.9089
## F-measure : 0.9076
## ----------------------------------------
## accuracy OR rand-index : 0.9075
## adjusted-rand-index : 0.8151
## jaccard-index : 0.8309
## fowlkes-mallows-index : 0.9076
## mirkin-metric : 1551006
## ----------------------------------------
## [1] 0.8308878
Jaccard index = 0.59, purity = 0.93 for k = 3, For k = 2, we have a higher jaccard index (0.83) and higher purity (0.95), so we keep k = 2
# Compute the distance object once and reuse it (avoids a second dist() call).
d6_dist <- dist(data6[, 2:3])
dist_matrix <- as.matrix(d6_dist)
# Complete linkage (name hc.single kept for consistency with the script)
hc.single <- hclust(d6_dist, method = 'complete')
plot(hc.single)
> Observing the dendrogram, we can cut it to create 2 clusters.
# Cut the complete-linkage dendrogram into 2 clusters
data6$h_cluster <- cutree(hc.single, 2)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data6$Class, clusters = data6$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 0.6008
## entropy : 0.3747
## normalized mutual information : 0.1394
## variation of information : 1.2717
## normalized var. of information : 0.9251
## ----------------------------------------
## specificity : 0.2048
## sensitivity : 0.8358
## precision : 0.5123
## recall : 0.8358
## F-measure : 0.6353
## ----------------------------------------
## accuracy OR rand-index : 0.5202
## adjusted-rand-index : 0.0406
## jaccard-index : 0.4655
## fowlkes-mallows-index : 0.6544
## mirkin-metric : 8047470
## ----------------------------------------
## [1] 0.4654753
Jaccard index = 0.47 and purity = 0.60 for our hierarchical clustering — noticeably worse than K-means on this dataset.
# 2D scatter plots of Data6 coloured by true class, K-means cluster,
# and hierarchical cluster respectively.
ggplot(data6, aes(x = X1, y = X2, color = as.factor(Class))) +
  geom_point()
ggplot(data6, aes(x = X1, y = X2, color = as.factor(kmeans_cluster))) +
  geom_point()
ggplot(data6, aes(x = X1, y = X2, color = as.factor(h_cluster))) +
  geom_point()
We check K value recommendation from elbow method and silhouette method
# Reproducible seed for kmeans initialisations
set.seed(10)
# Data7 is 2-dimensional (columns 2:3): elbow and silhouette plots for k
fviz_nbclust(data7[,2:3], kmeans, method = 'wss')
fviz_nbclust(data7[,2:3], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 6
#Setting k = 6 from silhouette recommendation
k <- 6
# Checking CH value for k values around the suggested k-value.
# Bug fix: this scan previously ran on data6 instead of data7.
print("K CH Value")
for(i in (k-2):(k+2)){
  km <- kmeans(data7[, 2:3], i, nstart = 20, iter.max = 15)
  ch <- round(calinhara(data7[, 2:3], km$cluster), digits = 2)
  print(paste(i, ch))
}
## [1] "4 3706.8"
## [1] "5 3713.51"
## [1] "6 3675.35"
## [1] "7 3627.04"
## [1] "8 3541.43"
We keep k = 6, the value suggested by the silhouette method
# Final K-means on Data7 with k = 6
k <- 6
km <- kmeans(data7[, 2:3], k, nstart = 20)
data7$kmeans_cluster <- km$cluster
# Display external validation stats (TRUE instead of shorthand T)
external_validation(true_labels = data7$Class, clusters = data7$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 0.9844
## entropy : 0.4208
## normalized mutual information : 0.6367
## variation of information : 1.1822
## normalized var. of information : 0.533
## ----------------------------------------
## specificity : 0.9943
## sensitivity : 0.6346
## precision : 0.9905
## recall : 0.6346
## F-measure : 0.7735
## ----------------------------------------
## accuracy OR rand-index : 0.8199
## adjusted-rand-index : 0.6355
## jaccard-index : 0.6307
## fowlkes-mallows-index : 0.7928
## mirkin-metric : 106658
## ----------------------------------------
## [1] 0.6307009
Jaccard index = 0.63, purity = 0.98 for k = 6
# Compute the distance object once and reuse it (avoids a second dist() call).
d7_dist <- dist(data7[, 2:3])
dist_matrix <- as.matrix(d7_dist)
# Single-linkage hierarchical clustering
hc.single <- hclust(d7_dist, method = 'single')
plot(hc.single)
> Observing the dendrogram, we can cut it to create 6 clusters.
# Cut the single-linkage dendrogram into 6 clusters
data7$h_cluster <- cutree(hc.single, 6)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data7$Class, clusters = data7$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1 and purity = 1 for our hierarchical clustering — perfect clustering. From the visualisations we can see that hierarchical clustering performs much better than K-means here.
# 2D scatter plots of Data7 coloured by true class, K-means cluster,
# and hierarchical cluster respectively.
ggplot(data7, aes(x = X1, y = X2, color = as.factor(Class))) +
  geom_point()
ggplot(data7, aes(x = X1, y = X2, color = as.factor(kmeans_cluster))) +
  geom_point()
ggplot(data7, aes(x = X1, y = X2, color = as.factor(h_cluster))) +
  geom_point()
# Dataset 8
We check K value recommendation from elbow method and silhouette method
# Reproducible seed for kmeans initialisations
set.seed(10)
# Elbow (WSS) and average-silhouette plots to suggest k for Data8
fviz_nbclust(data8[,2:4], kmeans, method = 'wss')
fviz_nbclust(data8[,2:4], kmeans, method = 'silhouette')
## Warning: did not converge in 10 iterations
K suggested by elbow and silhouette method = 6
#Setting k = 6 from silhouette recommendation
k <- 6
# Scan the CH index around the suggested k.
# Start at k = 2 because CH is undefined (NaN) for a single cluster, and
# raise iter.max to 15 since the default 10 iterations did not converge
# on this data (see the warnings in the original run).
print("K CH Value")
for(i in max(2, k-5):(k+5)){
  km <- kmeans(data8[, 2:4], i, nstart = 20, iter.max = 15)
  ch <- round(calinhara(data8[, 2:4], km$cluster), digits = 2)
  print(paste(i, ch))
}
## [1] "1 NaN"
## [1] "2 1352.03"
## [1] "3 1485.84"
## [1] "4 1665.55"
## [1] "5 1673.8"
## [1] "6 1811.06"
## [1] "7 1788.68"
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## [1] "8 1823.08"
## [1] "9 1834.67"
## [1] "10 1842.62"
## [1] "11 1846.98"
Keeping k = 6
# Final K-means at k = 6; iter.max raised to 15 to ensure convergence
k <- 6
km <- kmeans(data8[, 2:4], k, nstart = 20, iter.max=15)
data8$kmeans_cluster <- km$cluster
# Display external validation stats (TRUE instead of shorthand T)
external_validation(true_labels = data8$Class, clusters = data8$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : Inf
## normalized mutual information : 0
## variation of information : 2.585
## normalized var. of information : 1
## ----------------------------------------
## specificity : NaN
## sensitivity : 0.1665
## precision : 1
## recall : 0.1665
## F-measure : 0.2854
## ----------------------------------------
## accuracy OR rand-index : 0.1665
## adjusted-rand-index : 0
## jaccard-index : 0.1665
## fowlkes-mallows-index : 0.408
## mirkin-metric : 13346658
## ----------------------------------------
## [1] 0.1664591
Jaccard index = 0.16, purity = 1 for our clustering solution, which suggests we have many clusters that have high cohesion, but overall clustering is not that good. This is due to structure of data and kmeans is not able to find proper clustering in this dataset which will be clear from visualisation, as the data has only 1 class.
# Compute the distance object once and reuse it (avoids a second dist() call).
d8_dist <- dist(data8[, 2:4])
dist_matrix <- as.matrix(d8_dist)
# Single-linkage hierarchical clustering
hc.single <- hclust(d8_dist, method = 'single')
plot(hc.single)
> Observing the dendrogram, we can cut it to create 2 clusters.
# Cut the single-linkage dendrogram into 2 clusters
data8$h_cluster <- cutree(hc.single, 2)
# External validation against ground truth (TRUE instead of shorthand T)
external_validation(true_labels = data8$Class, clusters = data8$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : Inf
## normalized mutual information : 0
## variation of information : 0.0034
## normalized var. of information : 1
## ----------------------------------------
## specificity : NaN
## sensitivity : 0.9995
## precision : 1
## recall : 0.9995
## F-measure : 0.9998
## ----------------------------------------
## accuracy OR rand-index : 0.9995
## adjusted-rand-index : 0
## jaccard-index : 0.9995
## fowlkes-mallows-index : 0.9998
## mirkin-metric : 8002
## ----------------------------------------
## [1] 0.9995002
Jaccard index = 0.99 and purity = 1 for our hierarchical clustering, which is significantly better than K-means. Hierarchical clustering outperforms K-means here.
# 3D scatter plots of Data8: coloured by actual class, K-means cluster,
# and hierarchical cluster respectively.
axis_titles <- list(xaxis = list(title = 'X1'),
                    yaxis = list(title = 'X2'),
                    zaxis = list(title = 'X3'))
fig <- plot_ly(data8, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data8, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
fig <- plot_ly(data8, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
summary(world_data)
## Birth.Rate Business.Tax.Rate Days.to.Start.Business Energy.Usage
## Min. :0.00800 Length:208 Min. : 1.00 Min. : 765
## 1st Qu.:0.01200 Class :character 1st Qu.: 9.00 1st Qu.: 5699
## Median :0.01900 Mode :character Median : 18.00 Median : 18082
## Mean :0.02176 Mean : 30.33 Mean : 92238
## 3rd Qu.:0.02800 3rd Qu.: 35.00 3rd Qu.: 51557
## Max. :0.05000 Max. :694.00 Max. :2727728
## NA's :9 NA's :27 NA's :72
## GDP Health.Exp...GDP Health.Exp.Capita Hours.to.do.Tax
## Length:208 Min. :0.01700 Length:208 Min. : 12.0
## Class :character 1st Qu.:0.04700 Class :character 1st Qu.: 151.8
## Mode :character Median :0.06500 Mode :character Median : 224.0
## Mean :0.06736 Mean : 279.3
## 3rd Qu.:0.08500 3rd Qu.: 327.0
## Max. :0.17700 Max. :2600.0
## NA's :23 NA's :28
## Infant.Mortality.Rate Internet.Usage Lending.Interest Life.Expectancy.Female
## Min. :0.00200 Min. :0.0000 Min. :0.0050 Min. :45.00
## 1st Qu.:0.00700 1st Qu.:0.1000 1st Qu.:0.0745 1st Qu.:67.00
## Median :0.01600 Median :0.4000 Median :0.1060 Median :76.00
## Mean :0.02734 Mean :0.3769 Mean :0.1226 Mean :72.75
## 3rd Qu.:0.04225 3rd Qu.:0.6000 3rd Qu.:0.1500 3rd Qu.:80.00
## Max. :0.11200 Max. :0.9000 Max. :0.5250 Max. :87.00
## NA's :20 NA's :9 NA's :77 NA's :11
## Life.Expectancy.Male Mobile.Phone.Usage Population.0.14 Population.15.64
## Min. :45.00 Min. :0.0000 Min. :0.1190 Min. :0.4750
## 1st Qu.:62.00 1st Qu.:0.7000 1st Qu.:0.1890 1st Qu.:0.5810
## Median :70.00 Median :1.0000 Median :0.2790 Median :0.6540
## Mean :68.04 Mean :0.9587 Mean :0.2870 Mean :0.6355
## 3rd Qu.:75.00 3rd Qu.:1.2000 3rd Qu.:0.3825 3rd Qu.:0.6845
## Max. :81.00 Max. :2.5000 Max. :0.4990 Max. :0.8570
## NA's :11 NA's :12 NA's :17 NA's :17
## Population.65. Population.Urban Region Country
## Min. :0.00300 Min. :0.0890 Length:208 Length:208
## 1st Qu.:0.03400 1st Qu.:0.3777 Class :character Class :character
## Median :0.05600 Median :0.5715 Mode :character Mode :character
## Mean :0.07742 Mean :0.5767
## 3rd Qu.:0.11800 3rd Qu.:0.7725
## Max. :0.23700 Max. :1.0000
## NA's :17 NA's :2
# Strip thousands separators and currency/percent symbols from the character
# columns, then convert to numeric. The previous version round-tripped each
# column through as.factor(), which was an unnecessary intermediate step;
# "[$,]" removes both '$' and ',' in a single pass.
world_data$GDP <- as.numeric(gsub("[$,]", "", world_data$GDP))
world_data$Health.Exp.Capita <- as.numeric(gsub("[$,]", "", world_data$Health.Exp.Capita))
#Remove % from Tax rate column
world_data$Business.Tax.Rate <- as.numeric(sub("%", "", world_data$Business.Tax.Rate))
# Column-wise z-score standardisation of the 18 numeric indicator columns
scaled_world_data <- scale(world_data[, 1:18])
# Same standardisation kept as a data frame; across() replaces the
# superseded mutate_all()
scaled_world_data_2 <- world_data[, 1:18] %>%
  mutate(across(everything(), ~ as.vector(scale(.x))))
# Re-attach the (unscaled) character identifier columns
scaled_world_data_2$Region <- world_data$Region
scaled_world_data_2$Country <- world_data$Country
#Remove energy usage and lending interest as they have high number of NAs
scaled_world_data_2 <- scaled_world_data_2 %>%
  select(-Energy.Usage, -Lending.Interest)
# Drop rows with any remaining NA so the clustering functions get complete cases
scaled_world_data_NA <- na.omit(scaled_world_data_2)
Check elbow method and silhouette
# Reproducible seed for kmeans initialisations
set.seed(10)
# Elbow and silhouette suggestions for k on the cleaned, scaled world data
fviz_nbclust(scaled_world_data_NA[,1:16], kmeans, method = 'wss')
fviz_nbclust(scaled_world_data_NA[,1:16], kmeans, method = 'silhouette')
K suggested by elbow and silhouette method = 2
#Setting k = 2 from silhouette recommendation
k <- 2
#Checking CH value for k values around the suggested k-value
print("K CH Value")
## [1] "K CH Value"
for(i in (k):(k+4)){
km <- kmeans(scaled_world_data_NA[,1:16], i, nstart = 20)
ch <- round(calinhara(scaled_world_data_NA[,1:16],km$cluster),digits=2)
print(paste(i,ch))
}
## [1] "2 106.99"
## [1] "3 80.83"
## [1] "4 66.44"
## [1] "5 60.69"
## [1] "6 58.33"
CH value is highest for K = 2, so we select K = 2
# Compute the distance object once and reuse it (avoids a second dist() call).
world_dist <- dist(scaled_world_data_NA[,1:16])
dist_matrix <- as.matrix(world_dist)
# Single-linkage hierarchical clustering of countries
hc.single <- hclust(world_dist, method = 'single')
plot(hc.single)
> Observing the dendrogram, we can cut it to create 2 clusters.
scaled_world_data_NA$h_cluster <- cutree(hc.single, 2)